library(ISLR2) # For the Hitters data set
library(leaps) # For regsubsets()
library(glmnet) # For ridge and lasso
12 Model Selection: Ridge and Lasso Regression
13 Introduction
This document explores several techniques for selecting the “best” linear regression model. Two main approaches are covered:
1. Goodness-of-fit criteria, which penalize model complexity (AIC, BIC, adjusted \(R^2\)).
2. Predictive-ability criteria, which evaluate the model's performance on unseen data (simple validation and cross-validation).
Finally, the regularization methods ridge and lasso are introduced as alternatives for handling multicollinearity and performing variable selection automatically.
13.0.1 Required Libraries
The libraries loaded at the top of this document (ISLR2, leaps, and glmnet) are used throughout.
14 1. Goodness of Fit (Best Subset Selection)
14.1 a. Reading and Cleaning the Data
The Hitters data set is used, after removing the observations with missing values in the Salary variable.
head(Hitters)
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun
-Andy Allanson 293 66 1 30 29 14 1 293 66 1
-Alan Ashby 315 81 7 24 38 39 14 3449 835 69
-Alvin Davis 479 130 18 66 72 76 3 1624 457 63
-Andre Dawson 496 141 20 65 78 37 11 5628 1575 225
-Andres Galarraga 321 87 10 39 42 30 2 396 101 12
-Alfredo Griffin 594 169 4 74 51 35 11 4408 1133 19
CRuns CRBI CWalks League Division PutOuts Assists Errors
-Andy Allanson 30 29 14 A E 446 33 20
-Alan Ashby 321 414 375 N W 632 43 10
-Alvin Davis 224 266 263 A W 880 82 14
-Andre Dawson 828 838 354 N E 200 11 3
-Andres Galarraga 48 46 33 N E 805 40 4
-Alfredo Griffin 501 336 194 A W 282 421 25
Salary NewLeague
-Andy Allanson NA A
-Alan Ashby 475.0 N
-Alvin Davis 480.0 A
-Andre Dawson 500.0 N
-Andres Galarraga 91.5 N
-Alfredo Griffin 750.0 A
sum(is.na(Hitters$Salary))
[1] 59
Hitters <- na.omit(Hitters)
dim(Hitters)
[1] 263 20
14.2 b. Finding the Best Model for Each Number of Variables
The regsubsets() function finds the best subset of predictors for each model size (1 predictor, 2 predictors, and so on), where “best” means the subset that minimizes the residual sum of squares (RSS) or, equivalently, maximizes \(R^2\).
regfit.full <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
reg.summary <- summary(regfit.full)
names(reg.summary)
[1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
14.3 c. Choosing a Goodness-of-Fit Criterion
Different criteria are evaluated to select the optimal model size. A good criterion balances fit (low RSS) against parsimony (few variables). With \(p\) denoting the number of predictors in the model:
- Adjusted \(R^2\): \(R^2_{adj} = 1 - \frac{RSS/(n-p-1)}{TSS/(n-1)}\)
- Mallows' \(C_p\): \(C_p = \frac{1}{n}(RSS + 2p\hat{\sigma}^2)\)
- BIC: \(BIC = \frac{1}{n}(RSS + \log(n)\,p\,\hat{\sigma}^2)\)
We look for the model that maximizes the adjusted \(R^2\) or minimizes \(C_p\) and BIC.
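As a sanity check on the adjusted \(R^2\) formula, a minimal sketch (n_obs, tss, p_vars and adjr2_manual are illustrative names) that recomputes it from the stored RSS values and prints the first few values next to those reported by leaps:
# Recompute adjusted R^2 by hand from RSS and TSS for each model size
n_obs <- nrow(Hitters)
tss <- sum((Hitters$Salary - mean(Hitters$Salary))^2)
p_vars <- 1:19
adjr2_manual <- 1 - (reg.summary$rss / (n_obs - p_vars - 1)) / (tss / (n_obs - 1))
cbind(manual = round(adjr2_manual[1:5], 4), leaps = round(reg.summary$adjr2[1:5], 4))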
par(mfrow = c(2, 2))
# Residual sum of squares (RSS)
plot(reg.summary$rss, xlab = "Number of Variables", ylab = "RSS", type = "l")
# Adjusted R^2
plot(reg.summary$adjr2, xlab = "Number of Variables", ylab = "Adjusted R2", type = "l")
which.max(reg.summary$adjr2)
[1] 11
points(11, reg.summary$adjr2[11], col = "red", cex = 2, pch = 20)
# Mallows' Cp
plot(reg.summary$cp, xlab = "Number of Variables", ylab = "Cp", type = "l")
which.min(reg.summary$cp)
[1] 10
points(10, reg.summary$cp[10], col = "red", cex = 2, pch = 20)
# BIC
plot(reg.summary$bic, xlab = "Number of Variables", ylab = "BIC", type = "l")
which.min(reg.summary$bic)
[1] 6
points(6, reg.summary$bic[6], col = "red", cex = 2, pch = 20)
14.3.1 Visualizing the Selected Variables for Each Criterion
# Plots of the variables selected under each criterion
plot(regfit.full, scale = "r2")
plot(regfit.full, scale = "adjr2")
plot(regfit.full, scale = "Cp")
plot(regfit.full, scale = "bic")
Based on the BIC criterion, the best model is the one with 6 variables.
# Coefficients of the best 6-variable model according to BIC
coef(regfit.full, 6)
(Intercept) AtBat Hits Walks CRBI DivisionW
91.5117981 -1.8685892 7.6043976 3.6976468 0.6430169 -122.9515338
PutOuts
0.2643076
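As a cross-check, the same 6-variable model can be refit with lm(); the coefficients should reproduce the values shown above (a minimal sketch; fit_bic6 is an illustrative name, and DivisionW is the dummy generated from the Division factor):
# Refit the BIC-selected 6-variable model via ordinary least squares
fit_bic6 <- lm(Salary ~ AtBat + Hits + Walks + CRBI + Division + PutOuts,
               data = Hitters)
coef(fit_bic6)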
14.4 d. Forward and Backward Selection
These methods are computationally more efficient than best subset selection, although they are not guaranteed to find the overall best model.
# Forward Selection
regfit.fwd <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
summary(regfit.fwd)
Subset selection object
Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "forward")
19 Variables (and intercept)
Forced in Forced out
AtBat FALSE FALSE
Hits FALSE FALSE
HmRun FALSE FALSE
Runs FALSE FALSE
RBI FALSE FALSE
Walks FALSE FALSE
Years FALSE FALSE
CAtBat FALSE FALSE
CHits FALSE FALSE
CHmRun FALSE FALSE
CRuns FALSE FALSE
CRBI FALSE FALSE
CWalks FALSE FALSE
LeagueN FALSE FALSE
DivisionW FALSE FALSE
PutOuts FALSE FALSE
Assists FALSE FALSE
Errors FALSE FALSE
NewLeagueN FALSE FALSE
1 subsets of each size up to 19
Selection Algorithm: forward
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " " "*"
2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
4 ( 1 ) " " "*" " " " " " " " " " " " " " " " " " " "*"
5 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " "*"
6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " " " "*"
8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
1 ( 1 ) " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " "
3 ( 1 ) " " " " " " "*" " " " " " "
4 ( 1 ) " " " " "*" "*" " " " " " "
5 ( 1 ) " " " " "*" "*" " " " " " "
6 ( 1 ) " " " " "*" "*" " " " " " "
7 ( 1 ) "*" " " "*" "*" " " " " " "
8 ( 1 ) "*" " " "*" "*" " " " " " "
9 ( 1 ) "*" " " "*" "*" " " " " " "
10 ( 1 ) "*" " " "*" "*" "*" " " " "
11 ( 1 ) "*" "*" "*" "*" "*" " " " "
12 ( 1 ) "*" "*" "*" "*" "*" " " " "
13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
# Backward Selection
regfit.bwd <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
summary(regfit.bwd)
Subset selection object
Call: regsubsets.formula(Salary ~ ., data = Hitters, nvmax = 19, method = "backward")
19 Variables (and intercept)
Forced in Forced out
AtBat FALSE FALSE
Hits FALSE FALSE
HmRun FALSE FALSE
Runs FALSE FALSE
RBI FALSE FALSE
Walks FALSE FALSE
Years FALSE FALSE
CAtBat FALSE FALSE
CHits FALSE FALSE
CHmRun FALSE FALSE
CRuns FALSE FALSE
CRBI FALSE FALSE
CWalks FALSE FALSE
LeagueN FALSE FALSE
DivisionW FALSE FALSE
PutOuts FALSE FALSE
Assists FALSE FALSE
Errors FALSE FALSE
NewLeagueN FALSE FALSE
1 subsets of each size up to 19
Selection Algorithm: backward
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns CRBI
1 ( 1 ) " " " " " " " " " " " " " " " " " " " " "*" " "
2 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
3 ( 1 ) " " "*" " " " " " " " " " " " " " " " " "*" " "
4 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " "*" " "
5 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
6 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
7 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" " "
8 ( 1 ) "*" "*" " " " " " " "*" " " " " " " " " "*" "*"
9 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
10 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
11 ( 1 ) "*" "*" " " " " " " "*" " " "*" " " " " "*" "*"
12 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
13 ( 1 ) "*" "*" " " "*" " " "*" " " "*" " " " " "*" "*"
14 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" " " " " "*" "*"
15 ( 1 ) "*" "*" "*" "*" " " "*" " " "*" "*" " " "*" "*"
16 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
17 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*" " " "*" "*"
18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*"
19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
CWalks LeagueN DivisionW PutOuts Assists Errors NewLeagueN
1 ( 1 ) " " " " " " " " " " " " " "
2 ( 1 ) " " " " " " " " " " " " " "
3 ( 1 ) " " " " " " "*" " " " " " "
4 ( 1 ) " " " " " " "*" " " " " " "
5 ( 1 ) " " " " " " "*" " " " " " "
6 ( 1 ) " " " " "*" "*" " " " " " "
7 ( 1 ) "*" " " "*" "*" " " " " " "
8 ( 1 ) "*" " " "*" "*" " " " " " "
9 ( 1 ) "*" " " "*" "*" " " " " " "
10 ( 1 ) "*" " " "*" "*" "*" " " " "
11 ( 1 ) "*" "*" "*" "*" "*" " " " "
12 ( 1 ) "*" "*" "*" "*" "*" " " " "
13 ( 1 ) "*" "*" "*" "*" "*" "*" " "
14 ( 1 ) "*" "*" "*" "*" "*" "*" " "
15 ( 1 ) "*" "*" "*" "*" "*" "*" " "
16 ( 1 ) "*" "*" "*" "*" "*" "*" " "
17 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
18 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
19 ( 1 ) "*" "*" "*" "*" "*" "*" "*"
# Comparison of the best 7-variable model from each method
coef(regfit.full, 7) # Best subset
(Intercept) Hits Walks CAtBat CHits CHmRun
79.4509472 1.2833513 3.2274264 -0.3752350 1.4957073 1.4420538
DivisionW PutOuts
-129.9866432 0.2366813
coef(regfit.fwd, 7) # Forward
(Intercept) AtBat Hits Walks CRBI CWalks
109.7873062 -1.9588851 7.4498772 4.9131401 0.8537622 -0.3053070
DivisionW PutOuts
-127.1223928 0.2533404
coef(regfit.bwd, 7) # Backward
(Intercept) AtBat Hits Walks CRuns CWalks
105.6487488 -1.9762838 6.7574914 6.0558691 1.1293095 -0.7163346
DivisionW PutOuts
-116.1692169 0.3028847
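The three 7-variable models do not contain the same predictors. A small sketch to compare them programmatically (vars7 is an illustrative name, not part of the original code):
# Collect the predictor names chosen by each method at size 7
vars7 <- list(best     = names(coef(regfit.full, 7)),
              forward  = names(coef(regfit.fwd, 7)),
              backward = names(coef(regfit.bwd, 7)))
# Variables picked by best subset but not by forward selection
setdiff(vars7$best, vars7$forward)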
15 2. Predictive Ability (Simple Validation)
This approach evaluates the model's performance on a test set that was not used for training. The goal is to minimize the mean squared error (MSE) of prediction.
\[ MSE_{test} = \frac{1}{n_{test}} \sum_{i \in test} (y_i - \hat{y}_i)^2 \]
15.1 a. Splitting the Data into Training (70%) and Test (30%) Sets
set.seed(1)
train_indices <- sample(1:nrow(Hitters), size = 0.7 * nrow(Hitters))
train <- rep(FALSE, nrow(Hitters))
train[train_indices] <- TRUE
test <- !train

# Fit the models using only the training data
regfit.best <- regsubsets(Salary ~ ., data = Hitters[train, ], nvmax = 19)

# Build the design matrix for the test set
test.mat <- model.matrix(Salary ~ ., data = Hitters[test, ])
15.2 b. Computing the Prediction Error on the Test Set
val.errors <- rep(NA, 19)
for (i in 1:19) {
  coefi <- coef(regfit.best, id = i)
  pred <- test.mat[, names(coefi)] %*% coefi
  val.errors[i] <- mean((Hitters$Salary[test] - pred)^2)
}
# Model size that minimizes the MSE on the test set
which.min(val.errors)
[1] 12
coef(regfit.best, 7)
(Intercept) Walks CAtBat CHits CHmRun DivisionW
111.2705546 4.0681847 -0.4565451 1.8743517 1.6316479 -154.3160135
PutOuts Assists
0.2798340 0.2470505
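To see the whole validation curve rather than just its minimum, a short sketch that plots the test MSE against model size and marks the best point:
# Plot the validation MSE for each model size and highlight the minimum
plot(val.errors, xlab = "Number of Variables", ylab = "Test MSE", type = "b")
points(which.min(val.errors), min(val.errors), col = "red", cex = 2, pch = 20)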
16 3. Cross-Validation (k-fold Cross-Validation)
Cross-validation is a more robust way to estimate the prediction error, especially with moderately sized samples. The data are split into k folds, and the model is fit k times, each time using a different fold as the test set and the remaining folds as the training set.
16.1 Creating the Folds (k = 10)
k <- 10
n <- nrow(Hitters)
set.seed(1)
folds <- sample(rep(1:k, length = n))
cv.errors <- matrix(NA, k, 19, dimnames = list(NULL, paste(1:19)))
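A quick sanity check on the fold assignment (a minimal sketch): with n = 263 and k = 10, each fold should contain roughly 26 observations.
# Count how many observations fall in each fold
table(folds)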
16.2 Training and Evaluating the Models on Each Fold
# predict() method for regsubsets objects
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])
  mat <- model.matrix(form, newdata)
  coefi <- coef(object, id = id)
  xvars <- names(coefi)
  mat[, xvars] %*% coefi
}

for (j in 1:k) {
  best.fit <- regsubsets(Salary ~ .,
                         data = Hitters[folds != j, ],
                         nvmax = 19)
  for (i in 1:19) {
    pred <- predict(best.fit, Hitters[folds == j, ], id = i)
    cv.errors[j, i] <- mean((Hitters$Salary[folds == j] - pred)^2)
  }
}
16.3 Summarizing the Errors and Selecting the Best Model
The MSE is averaged across the k folds for each model size.
mean.cv.errors <- apply(cv.errors, 2, mean)
plot(mean.cv.errors, type = "b")

# The 10-variable model minimizes the average cross-validation MSE
which.min(mean.cv.errors)
10
10
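Analogous to the earlier criterion plots, the cross-validation minimum can be highlighted on the plot just drawn (a minimal sketch):
# Mark the model size with the lowest average cross-validation error
points(which.min(mean.cv.errors), min(mean.cv.errors), col = "red", cex = 2, pch = 20)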
# Coefficients of the final model, refit on the full data set
reg.best.final <- regsubsets(Salary ~ ., data = Hitters, nvmax = 19)
coef(reg.best.final, 10)
(Intercept) AtBat Hits Walks CAtBat CRuns
162.5354420 -2.1686501 6.9180175 5.7732246 -0.1300798 1.4082490
CRBI CWalks DivisionW PutOuts Assists
0.7743122 -0.8308264 -112.3800575 0.2973726 0.2831680
17 4. Ridge and Lasso Regression
These are regularization methods that penalize the magnitude of the coefficients in order to prevent overfitting and handle multicollinearity.
- Ridge: minimizes \(RSS + \lambda \sum \beta_j^2\). It shrinks the coefficients but never sets them exactly to zero.
- Lasso: minimizes \(RSS + \lambda \sum |\beta_j|\). It can shrink coefficients all the way to zero, thereby performing variable selection.
# Prepare the predictor matrix x and the response vector y
x <- model.matrix(Salary ~ ., Hitters)[, -1]
y <- Hitters$Salary
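A quick check of the design matrix (a sketch): model.matrix() dummy-codes the qualitative variables and [, -1] drops the intercept column, leaving 19 numeric predictors for the 263 players.
# Dimensions and first few column names of the predictor matrix
dim(x)
head(colnames(x))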
17.1 Ridge Regression (\(\alpha=0\))
# Grid of candidate values for the penalty parameter lambda
grid <- 10^seq(10, -2, length = 100)

# Fit the ridge model
ridge.mod <- glmnet(x, y, alpha = 0, lambda = grid)

# Choose the optimal lambda by cross-validation
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 0)
# plot(cv.out) # Uncomment to see the plot in an interactive session
bestlam_ridge <- cv.out$lambda.min
bestlam_ridge
[1] 167.7025
# Evaluate the MSE on the test set
ridge.pred <- predict(ridge.mod, s = bestlam_ridge, newx = x[test, ])
mean((ridge.pred - y[test])^2)
[1] 119774.7
# Final ridge coefficients, refit on the full data set
out_ridge <- glmnet(x, y, alpha = 0)
predict(out_ridge, type = "coefficients", s = bestlam_ridge)
20 x 1 sparse Matrix of class "dgCMatrix"
s1
(Intercept) 10.906017127
AtBat -0.003941931
Hits 1.107049069
HmRun -0.130566480
Runs 1.134746101
RBI 0.866683100
Walks 1.911183206
Years -0.696009116
CAtBat 0.010861092
CHits 0.069735913
CHmRun 0.478744948
CRuns 0.138326679
CRBI 0.147426144
CWalks 0.011038802
LeagueN 30.064959596
DivisionW -97.672565744
PutOuts 0.203950143
Assists 0.051690395
Errors -2.068158795
NewLeagueN 5.455166167
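To make the shrinkage effect concrete, a small sketch (l2_norm is an illustrative helper, not part of the original code) comparing the size of the ridge coefficient vector at the largest and smallest lambda values in the grid used above:
# L2 norm of the coefficients (excluding the intercept) at two extreme lambdas
l2_norm <- function(b) sqrt(sum(as.numeric(b)[-1]^2))
l2_norm(coef(ridge.mod, s = grid[1]))   # lambda = 1e10: coefficients shrunk toward zero
l2_norm(coef(ridge.mod, s = grid[100])) # lambda = 0.01: close to the least-squares fit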
17.2 Lasso Regression (\(\alpha = 1\))
# Fit the lasso model
lasso.mod <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
# plot(lasso.mod) # Coefficient-path plot

# Choose the optimal lambda by cross-validation
set.seed(1)
cv.out <- cv.glmnet(x[train, ], y[train], alpha = 1)
# plot(cv.out)
bestlam_lasso <- cv.out$lambda.min
bestlam_lasso
[1] 19.28171
# Evaluate the MSE on the test set
lasso.pred <- predict(lasso.mod, s = bestlam_lasso, newx = x[test, ])
mean((lasso.pred - y[test])^2)
[1] 134796.5
# Final lasso coefficients, refit on the full data set
out_lasso <- glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef <- predict(out_lasso, type = "coefficients", s = bestlam_lasso)
lasso.coef[lasso.coef != 0] # Show only the variables that were not dropped
[1] 25.7481900 1.8453712 2.1895287 0.2053397 0.4088308 -98.9878912
[7] 0.2144732
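Plain subsetting of the sparse coefficient matrix drops the variable names. A short sketch (nz is an illustrative name) to display the retained lasso coefficients together with their names:
# Non-zero lasso coefficients with their variable names
nz <- which(as.numeric(lasso.coef) != 0)
data.frame(variable = rownames(lasso.coef)[nz],
           coefficient = as.numeric(lasso.coef)[nz])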